# Libraries
library(tidyverse)
library(hrbrthemes)
library(viridis)
library(patchwork)
# Dataset: three Gaussian clusters (groups A, B and C), 20,000 points each.
a <- data.frame(
  x = rnorm(20000, mean = 10, sd = 1.2),
  y = rnorm(20000, mean = 10, sd = 1.2),
  group = rep("A", times = 20000)
)
b <- data.frame(
  x = rnorm(20000, mean = 14.5, sd = 1.2),
  y = rnorm(20000, mean = 14.5, sd = 1.2),
  group = rep("B", times = 20000)
)
c <- data.frame(
  x = rnorm(20000, mean = 9.5, sd = 1.5),
  y = rnorm(20000, mean = 15.5, sd = 1.5),
  group = rep("C", times = 20000)
)
# Stack the three clusters into a single 60,000-row data frame.
data <- rbind(a, b, c)
# Baseline scatterplot: large opaque dots for all 60,000 points.
ggplot(data, mapping = aes(x = x, y = y)) +
  geom_point(size = 2, color = "#69b3a2") +
  theme_ipsum() +
  theme(legend.position = "none")
Overplotting is a common issue in dataviz. When your dataset is big, dots of your scatterplot tend to overlap, making the graphic unreadable.
This issue is illustrated in the scatterplot above. A first look might lead to the conclusion that there is no obvious relationship between X and Y. We will see below how wrong that is.
In this post, I suggest 10 workarounds to avoid overplotting.
The easiest workaround is probably to reduce the dot size. Depending on the amount of overlap you have, it can give a really satisfying result. Here it clearly appears that 3 clusters are present, which was hidden in the previous figure.
# Plot with small dot size: shrinking the dots reduces their overlap.
ggplot(data, mapping = aes(x = x, y = y)) +
  geom_point(size = 0.02, color = "#69b3a2") +
  theme_ipsum() +
  theme(legend.position = "none")

In combination with decreasing dot size, using transparency also helps to reveal patterns when you encounter overplotting issues:
# Plot with transparency: nearly transparent dots add up where points overlap.
ggplot(data, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.01, size = 2, color = "#69b3a2") +
  theme_ipsum() +
  theme(legend.position = "none")

The 2D density chart basically counts the number of observations within a particular area of the 2D space and represents this count by a color. If you divide the space into several squares you get a 2D histogram. If you use hexagons you get a hexbin plot. You can also calculate a density estimate and represent it as a 2D density plot or a contour plot. You can read more about this on the dedicated page of data-to-viz.com.
# 2D density estimate drawn as a raster: the fill colour encodes the density.
ggplot(data, aes(x = x, y = y)) +
  # after_stat() replaces the `..density..` notation, which is deprecated
  # since ggplot2 3.4.0.
  stat_density_2d(aes(fill = after_stat(density)), geom = "raster", contour = FALSE) +
  # Remove the default axis padding so the raster fills the whole panel.
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  scale_fill_viridis() +
  theme(legend.position = "none")

Sometimes less is more. Plotting only a fraction of your data (5% here) greatly reduces the computing time and helps to avoid overplotting:
# Plot a random 5% sample of the rows instead of the full dataset.
ggplot(sample_frac(data, size = 0.05), mapping = aes(x = x, y = y)) +
  geom_point(size = 2, color = "#69b3a2") +
  theme_ipsum() +
  theme(legend.position = "none")

Another way to reduce the complexity of the graphic is to highlight a specific group. (This assumes that you have a grouping variable in your dataset.)
# Highlight group B: every point in grey first, then group B on top in colour.
ggplot(data, mapping = aes(x = x, y = y)) +
  geom_point(size = 2, color = "grey") +
  geom_point(data = filter(data, group == "B"), size = 2, color = "#69b3a2") +
  ggtitle("Behavior of the group B") +
  theme_ipsum() +
  theme(
    plot.title = element_text(size = 12),
    legend.position = "none"
  )

If you have a grouping variable, it is highly recommended to make it appear on the graphic. In the case of overplotting, it can also help to reveal patterns.
# Colour the points by group, with some transparency.
ggplot(data, mapping = aes(x = x, y = y, color = group)) +
  geom_point(alpha = 0.1, size = 2) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum()

As soon as you have several groups in your plot, an alternative is to use faceting: the same plot is repeated, each time highlighting another group:
# Faceted plot: one panel per group, with the whole dataset in grey behind it.
data %>%
  ggplot(aes(x = x, y = y)) +
  # The background layer drops `group`, so it has no facet variable and is
  # drawn in every panel. select() is namespace-qualified because
  # library(MASS), attached further down this script, masks dplyr::select();
  # a bare select() would fail when this chunk is re-run in the same session.
  geom_point(data = data %>% dplyr::select(-group), size = 1, alpha = 0.05, color = "grey") +
  geom_point(aes(color = group), size = 2, alpha = 0.1) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  theme(legend.position = "none") +
  facet_wrap(~group)

Jittering is an option when one of the axes is qualitative (like 1, 2, 3..., see left figure). It adds or subtracts a random value to each data point to avoid overplotting. Note that other chart types are available in that kind of situation, like the boxplot or the violin plot.
# Create data: 5 categories (x = 1..5) with 1000 observations each.
don <- data.frame(
  x = rep(1:5, each = 1000),
  y = c(
    rnorm(1000, mean = 4, sd = 2),
    rnorm(1000, mean = 4, sd = 4),
    # Category 3 is a mix of two 500-point distributions.
    rnorm(500, mean = 2, sd = 1),
    rnorm(500, mean = 10, sd = 2),
    rnorm(1000, mean = 8, sd = 4),
    rnorm(1000, mean = 10, sd = 4)
  )
)
# Basic plot: points are drawn at their exact x position, without jitter.
p1 <- ggplot(don, mapping = aes(x = x, y = y)) +
  geom_point(mapping = aes(color = x), alpha = 0.2, size = 2) +
  scale_color_viridis() +
  theme_ipsum() +
  theme(legend.position = "none")
# Plot with jitter: same data, with random horizontal noise (width = 0.3).
p2 <- ggplot(don, mapping = aes(x = x, y = y)) +
  geom_jitter(mapping = aes(color = x), width = 0.3, alpha = 0.2, size = 2) +
  scale_color_viridis() +
  theme_ipsum() +
  theme(legend.position = "none")
# Combine the two plots with patchwork's `+` operator.
p1 + p2

As for 2D density plots, it is possible to transform the scatterplot information into a grid and count the number of data points at each position of the grid. Then, instead of representing this number by a color gradient, the surface plot uses a third dimension so that dense areas appear higher than others. In this case, the positions of the 3 groups become obvious:
library(plotly)
library(MASS)
# Compute a 2D kernel density estimate on a 50 x 50 grid
kd <- with(data, MASS::kde2d(x, y, n = 50))
# Plot with plotly
# kde2d() returns z with rows along x and columns along y, whereas a plotly
# surface reads the rows of z along y and the columns along x. Without the
# transpose the surface is drawn with the two axes swapped.
plot_ly(x = kd$x, y = kd$y, z = t(kd$z)) %>% add_surface()

Adding marginal distributions lets you detect the distribution hidden in the overplotted parts of the graphic. You can add a boxplot, a histogram or a density plot in the margins.
library(ggExtra)
# Build the base ggplot2 scatterplot (large, nearly transparent dots).
p <- ggplot(data, mapping = aes(x = x, y = y)) +
  geom_point(alpha = 0.01, size = 2, color = "#69b3a2") +
  theme_ipsum() +
  theme(legend.position = "none")
# Add marginal histograms to the scatterplot.
ggExtra::ggMarginal(p, type = "histogram")

Any thoughts on this? Found any mistake? Disagree? Please drop me a word on twitter or in the comment section below:
A work by Yan Holtz for data-to-viz.com